# Load necessary packages
import os
import numpy as np
import pandas as pd
import nltk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from kneed import KneeLocator
import matplotlib.pyplot as plt
import gensim.downloader as api
import re
from sklearn.decomposition import PCA
import random

# Set Seed for Reproducibility
# Seed both Python's and NumPy's global generators: random.seed alone does not
# affect libraries that fall back to NumPy's global random state (for example
# scikit-learn estimators created without an explicit random_state).
random.seed(1996)
np.random.seed(1996)
#############################
# Load Data
df = pd.read_csv('model_text_emphasis.csv')
# Load pretrained word2vec embeddings (downloaded/cached by gensim on first use)
wv = api.load('word2vec-google-news-300')
### Estimation for Model in Paper ###
# Isolate issue statements for inputting into model
df_text = df['issue_final']
# Candidate identifiers; used further down as the document tags
tag = df['candidate_id']

# Create tagged data for document vectors.
# One TaggedDocument per row, tagged with that row's candidate id, so every
# statement by the same candidate contributes to a single document vector.
# zip() pairs the two columns positionally, so this does not depend on the
# DataFrame carrying a default 0..n-1 index.
tagged_data = [
    TaggedDocument(words=word_tokenize(str(text)), tags=[str(cand)])
    for text, cand in zip(df_text, tag)
]

# Build model (DBOW with simultaneous skip-gram word training)
model = Doc2Vec(dm=0, dbow_words=1,
                vector_size=300, window=6, min_count=5,
                workers=1, epochs=20, seed=1996)
model.build_vocab(tagged_data)

# Initialise word vectors from the pretrained embeddings.
# The pretrained matrix must NOT be assigned wholesale to model.wv.vectors:
# its rows follow the pretrained vocabulary's ordering, not this corpus's, so
# row i would belong to an unrelated word, and training would then modify the
# shared pretrained array in place. Instead, copy the pretrained vector for
# each corpus word that exists in the pretrained vocabulary; words without a
# pretrained vector keep the random initialisation from build_vocab().
if wv.vector_size != model.vector_size:
    raise ValueError(
        "Pretrained embedding size (%d) does not match model vector_size (%d)"
        % (wv.vector_size, model.vector_size)
    )
for word, idx in model.wv.key_to_index.items():
    if word in wv.key_to_index:
        model.wv.vectors[idx] = wv.get_vector(word)

model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

# Unique candidate ids, in order of first appearance
candidates = tag.drop_duplicates()
# Isolate Document Vectors
M = model.vector_size
P = len(candidates)
z = np.zeros((P, M))

# Populate one row per candidate. Vectors are looked up by the same string tag
# used when building the TaggedDocuments, so row i of z is guaranteed to
# belong to the i-th entry of `candidates` rather than relying on the model's
# internal tag ordering matching drop_duplicates() order.
for i, cand in enumerate(candidates):
    z[i, :] = model.dv[str(cand)]

# Isolate Web Scores and Candidate ID
# The web score is the projection of each document vector onto the first
# principal component. random_state pins the result when scikit-learn selects
# its randomized SVD solver; it has no effect on the exact solvers.
pca_candidate = PCA(n_components=1, random_state=1996)
Zcand = pd.DataFrame(pca_candidate.fit_transform(z), columns=['web_score'])
# reset_index() gives a 0..P-1 index so the ids align row-for-row with Zcand
candidates_fixed = candidates.reset_index()
Zcand['candidate_id'] = candidates_fixed['candidate_id']

Zcand.to_csv('webscore_estimates_emphasis.csv', index=False)



